# Load the libraries we will need
suppressPackageStartupMessages({
library(tidyverse)
library(dplyr)
library(plotly)
library(ggplot2)
library(viridis)
library(naniar)
library(hrbrthemes)
})
# Silence all warnings globally (warn = -1) so none show up in the exported notebook.
# NOTE(review): this stays in effect for the rest of the script, so warnings raised
# by later steps (e.g. NAs introduced by as.numeric()/as.integer()) are hidden too.
options(warn = -1)
Foundation Foods includes values for nutrients and other food components for a diverse range of basic foods (unprocessed or lightly processed foods) and provides extensive underlying metadata, including the number of samples, sampling location, date of collection, analytical approaches used, and if appropriate, agricultural information such as genotype and production practices.
# Read the CoFID CSV from the Kaggle input folder and let janitor normalise the
# column names
food_composition_raw <- janitor::clean_names(
  read.csv("/kaggle/input/composition-of-foods-integrated-dataset-cofid/McCance_Widdowsons_Composition_of_Foods_Integrated_Dataset_2021.csv")
)
# Preview the first rows, then the column types
head(food_composition_raw)
glimpse(food_composition_raw)
| food_code | food_name | description | group | previous | main_data_references | footnote | water_g | total_nitrogen_g | protein_g | ⯠| cholesterol_mg | x | x_1 | x_2 | x_3 | x_4 | x_5 | x_6 | x_7 | x_8 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | ⯠| <chr> | <lgl> | <lgl> | <lgl> | <lgl> | <lgl> | <lgl> | <lgl> | <lgl> | <lgl> | |
| 1 | WATER | TOTNIT | PROT | ⯠| CHOL | NA | NA | NA | NA | NA | NA | NA | NA | NA | |||||||
| 2 | Water | Total nitrogen | Protein | ⯠| Cholesterol | NA | NA | NA | NA | NA | NA | NA | NA | NA | |||||||
| 3 | 13-145 | Ackee, canned, drained | 8 cans | DG | 554 | MW4, 1978; and Vegetables, Herbs and Spices Supplement, 1991 | 76.7 | 0.46 | 2.9 | ⯠| 0.0 | NA | NA | NA | NA | NA | NA | NA | NA | NA | |
| 4 | 13-146 | Agar, dried | Literature sources | DG | Wu Leung et al. (1972) Food composition table for use in East Asia, Food and Agriculature Organization and US Department of Health | 9.7 | 0.26 | 1.3 | ⯠| 0.0 | NA | NA | NA | NA | NA | NA | NA | NA | NA | ||
| 5 | 13-147 | Agar, dried, soaked and drained | Literature sources | DG | Wu Leung et al. (1972) Food composition table for use in East Asia, Food and Agriculature Organization and US Department of Health | 84.2 | 0.03 | 0.2 | ⯠| 0.0 | NA | NA | NA | NA | NA | NA | NA | NA | NA | ||
| 6 | 13-148 | Alfalfa sprouts, raw | Analytical and literature sources | DG | Vegetables, Herbs and Spices Supplement, 1991 | 93.4 | 0.64 | 4.0 | ⯠| 0.0 | NA | NA | NA | NA | NA | NA | NA | NA | NA |
Rows: 2,889 Columns: 56 $ food_code <chr> "", "", "13-145", "13-146", "13-147", "1⦠$ food_name <chr> "", "", "Ackee, canned, drained", "Agar,⦠$ description <chr> "", "", "8 cans", "Literature sources", ⦠$ group <chr> "", "", "DG", "DG", "DG", "DG", "H", "GA⦠$ previous <chr> "", "", "554", "", "", "", "", "14-801 1⦠$ main_data_references <chr> "", "", "MW4, 1978; and Vegetables, Herb⦠$ footnote <chr> "", "", "", "", "", "", "", "", "", "", ⦠$ water_g <chr> "WATER", "Water", "76.7", "9.7", "84.2",⦠$ total_nitrogen_g <chr> "TOTNIT", "Total nitrogen", "0.46", "0.2⦠$ protein_g <chr> "PROT", "Protein", "2.9", "1.3", "0.2", ⦠$ fat_g <chr> "FAT", "Fat", "15.2", "1.2", "0.1", "0.7⦠$ carbohydrate_g <chr> "CHO", "Carbohydrate", "0.8", "Tr", "Tr"⦠$ energy_kcal_kcal <chr> "KCALS", "kcal", "151", "16", "2", "24",⦠$ energy_k_j_k_j <chr> "KJ", "kJ", "625", "67", "7", "100", "N"⦠$ starch_g <chr> "STAR", "Starch", "Tr", "0.0", "0.0", "T⦠$ oligosaccharide_g <chr> "OLIGO", "Oligosaccharide", "", "", "", ⦠$ total_sugars_g <chr> "TOTSUG", "Total sugars", "0.8", "Tr", "⦠$ glucose_g <chr> "GLUC", "Glucose", "0.1", "0.0", "0.0", ⦠$ galactose_g <chr> "GALACT", "Galactose", "0.0", "0.0", "0.⦠$ fructose_g <chr> "FRUCT", "Fructose", "Tr", "0.0", "0.0",⦠$ sucrose_g <chr> "SUCR", "Sucrose", "0.7", "0.0", "0.0", ⦠$ maltose_g <chr> "MALT", "Maltose", "0.0", "0.0", "0.0", ⦠$ lactose_g <chr> "LACT", "Lactose", "0.0", "0.0", "0.0", ⦠$ alcohol_g <chr> "ALCO", "Alcohol", "", "", "", "", "", "⦠$ nsp_g <chr> "ENGFIB", "Non-starch polysaccharide", "⦠$ aoac_fibre_g <chr> "AOACFIB", "AOAC fibre", "", "", "", "",⦠$ satd_fa_100g_fa_g <chr> "SATFAC", "Saturated fatty acids per 100⦠$ satd_fa_100g_fd_g <chr> "SATFOD", "Saturated fatty acids per 100⦠$ n_6_poly_100g_fa_g <chr> "TOTn6PFAC", "Total n-6 polyunsaturated ⦠$ n_6_poly_100g_food_g <chr> "TOTn6PFOD", "Total n-6 polyunsaturated ⦠$ n_3_poly_100g_fa_g <chr> "TOTn3PFAC", "Total n-3 polyunsaturated ⦠$ n_3_poly_100g_food_g <chr> "TOTn3PFOD", "Total n-3 
polyunsaturated ⦠$ cis_mono_fa_100g_fa_g <chr> "MONOFACc", "cis-Monounsaturated fatty a⦠$ cis_mono_fa_100g_food_g <chr> "MONOFODc", "cis-Monounsaturated fatty a⦠$ mono_fa_100g_fa_g <chr> "MONOFAC", "Monounsaturated fatty acids ⦠$ mono_fa_100g_food_g <chr> "MONOFOD", "Monounsaturated fatty acids ⦠$ cis_polyu_fa_100g_fa_g <chr> "POLYFACc", "cis-Polyunsaturated fatty a⦠$ cis_poly_fa_100g_food_g <chr> "POLYFODc", "cis-Polyunsaturated fatty a⦠$ poly_fa_100g_fa_g <chr> "POLYFAC", "Polyunsaturated fatty acids ⦠$ poly_fa_100g_food_g <chr> "POLYFOD", "Polyunsaturated fatty acids ⦠$ sat_fa_excl_br_100g_fa_g <chr> "SATFACx6", "Saturated fatty acids exclu⦠$ sat_fa_excl_br_100g_food_g <chr> "SATFODx6", "Saturated fatty acids exclu⦠$ branched_chain_fa_100g_fa_g <chr> "TOTBRFAC", "Total branched chain per 10⦠$ branched_chain_fa_100g_food_g <chr> "TOTBRFOD", "Total branched chain per 10⦠$ trans_f_as_100g_fa_g <chr> "FACTRANS", "Total Trans fatty acids per⦠$ trans_f_as_100g_food_g <chr> "FODTRANS", "Total Trans fatty acids per⦠$ cholesterol_mg <chr> "CHOL", "Cholesterol", "0.0", "0.0", "0.⦠$ x <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ⦠$ x_1 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ⦠$ x_2 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ⦠$ x_3 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ⦠$ x_4 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ⦠$ x_5 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ⦠$ x_6 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ⦠$ x_7 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ⦠$ x_8 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ā¦
# Build the working subset from the raw table:
#   1. drop rows 1 and 2, which only hold descriptive labels for the columns;
#   2. drop herbs and spices (H) and flours and grains (AA): they are treated here
#      as additives used in nutritionally insignificant quantities for flavouring
#      or colouring, not as standalone foods;
#   3. keep only the columns used in the rest of the analysis.
food_composition <- food_composition_raw %>%
  slice(-(1:2)) %>%
  filter(!group %in% c("H", "AA")) %>%
  select(food_name, aoac_fibre_g, energy_kcal_kcal, total_sugars_g, group)
# Preview the subset and its column types
head(food_composition)
glimpse(food_composition)
| food_name | aoac_fibre_g | energy_kcal_kcal | total_sugars_g | group | |
|---|---|---|---|---|---|
| <chr> | <chr> | <chr> | <chr> | <chr> | |
| 1 | Ackee, canned, drained | 151 | 0.8 | DG | |
| 2 | Agar, dried | 16 | Tr | DG | |
| 3 | Agar, dried, soaked and drained | 2 | Tr | DG | |
| 4 | Alfalfa sprouts, raw | 24 | 0.3 | DG | |
| 5 | Almonds, flaked and ground | N | 612 | 4.2 | GA |
| 6 | Almonds, toasted | 10.9 | 579 | 5.1 | GA |
Rows: 2,797 Columns: 5 $ food_name <chr> "Ackee, canned, drained", "Agar, dried", "Agar, dried⦠$ aoac_fibre_g <chr> "", "", "", "", "N", "10.9", "4.6", "12.5", "", "", "⦠$ energy_kcal_kcal <chr> "151", "16", "2", "24", "612", "579", "205", "554", "⦠$ total_sugars_g <chr> "0.8", "Tr", "Tr", "0.3", "4.2", "5.1", "1.6", "4.5",⦠$ group <chr> "DG", "DG", "DG", "DG", "GA", "GA", "GA", "GA", "DG",ā¦
We notice that missing values are given either as empty strings or as 'N', which stands for not available. Let's have a look at how missing values are encoded in the data, for example in the 'aoac_fibre_g' column:
# Display naniar's built-in vector of strings that commonly stand for a missing value
common_na_strings
- 'missing'
- 'NA'
- 'N A'
- 'N/A'
- '#N/A'
- 'NA '
- ' NA'
- 'N /A'
- 'N / A'
- ' N / A'
- 'N / A '
- 'na'
- 'n a'
- 'n/a'
- 'na '
- ' na'
- 'n /a'
- 'n / a'
- ' a / a'
- 'n / a '
- 'NULL'
- 'null'
- ''
- '\\?'
- '\\*'
- '\\.'
# Extend naniar's built-in list with the markers specific to this dataset:
#   "N"  - the dataset's "not available" marker; it is not part of
#          common_na_strings, so it has to be added explicitly to be replaced
#   "Tr" - NOTE(review): presumably "trace" in the dataset's notation; it is
#          treated as missing here - confirm this is intended rather than 0
common_na_strings_exp <- c(common_na_strings, "N", "Tr")
common_na_strings_exp
- 'missing'
- 'NA'
- 'N A'
- 'N/A'
- '#N/A'
- 'NA '
- ' NA'
- 'N /A'
- 'N / A'
- ' N / A'
- 'N / A '
- 'na'
- 'n a'
- 'n/a'
- 'na '
- ' na'
- 'n /a'
- 'n / a'
- ' a / a'
- 'n / a '
- 'NULL'
- 'null'
- ''
- '\\?'
- '\\*'
- '\\.'
- 'Tr'
# Check how missing values are encoded in the subset.
# Count the cells whose value is exactly one of the known NA strings. The test is
# done column by column with %in%: grepl() on a whole data frame only tests one
# deparsed string per column, and the "" entry of the list would turn the combined
# regular expression into a pattern that matches everything.
print(paste(
  "Total na offending string instances",
  sum(vapply(
    food_composition,
    function(column) sum(column %in% common_na_strings_exp),
    integer(1)
  ))
))
# Out of which given as 'NA' (real NA values) are
print(paste("Given as 'NA' are", sum(is.na(food_composition), na.rm = FALSE)))
# While given as '' are
print(paste("Given as '' are", sum(food_composition == '', na.rm = TRUE)))
# While given as 'N' are
print(paste("Given as 'N' are", sum(food_composition == 'N', na.rm = TRUE)))
# And null are (is.null() tests the object as a whole, not its cells, so this is
# 0 for any data frame):
print(paste("Null are", sum(is.null(food_composition), na.rm = TRUE)))
[1] "Total na offending string instances 5" [1] "Given as 'NA' are 0" [1] "Given as '' are 827" [1] "Given as 'N' are 519" [1] "Null are 0"
Let's make sure all missing values are given as NA, rather than e.g. "N":
# Turn every cell that matches one of the known NA strings into a real NA (naniar)
food_composition <- food_composition %>%
  replace_with_na_all(condition = ~ .x %in% common_na_strings_exp)
# Total number of NA cells after the replacement
sum(is.na(food_composition))
Finally, let's convert the fibre and sugar columns to decimal (numeric) type and the kcal column to integer:
# Give the measurement columns proper numeric types
food_composition <- food_composition %>%
  mutate(
    # Fibre and sugar are decimal quantities
    across(c(aoac_fibre_g, total_sugars_g), as.numeric),
    # Energy is stored as whole kcal
    energy_kcal_kcal = as.integer(energy_kcal_kcal)
  )
glimpse(food_composition)
Rows: 2,797 Columns: 5 $ food_name <chr> "Ackee, canned, drained", "Agar, dried", "Agar, dried⦠$ aoac_fibre_g <dbl> NA, NA, NA, NA, NA, 10.9, 4.6, 12.5, NA, NA, NA, 0.0,⦠$ energy_kcal_kcal <int> 151, 16, 2, 24, 612, 579, 205, 554, 16, 18, 58, 191, ⦠$ total_sugars_g <dbl> 0.8, NA, NA, 0.3, 4.2, 5.1, 1.6, 4.5, 0.2, 0.2, NA, 0⦠$ group <chr> "DG", "DG", "DG", "DG", "GA", "GA", "GA", "GA", "DG",ā¦
# Let's check our subset at this point
head(food_composition)
| food_name | aoac_fibre_g | energy_kcal_kcal | total_sugars_g | group |
|---|---|---|---|---|
| <chr> | <dbl> | <int> | <dbl> | <chr> |
| Ackee, canned, drained | NA | 151 | 0.8 | DG |
| Agar, dried | NA | 16 | NA | DG |
| Agar, dried, soaked and drained | NA | 2 | NA | DG |
| Alfalfa sprouts, raw | NA | 24 | 0.3 | DG |
| Almonds, flaked and ground | NA | 612 | 4.2 | GA |
| Almonds, toasted | 10.9 | 579 | 5.1 | GA |
To get an understanding of the number of missing values, we'll use the Amelia library:
library(Amelia)
# Enlarge the notebook plotting area so the map stays readable
options(repr.plot.width = 14, repr.plot.height = 12)
# Map of missing versus observed cells for every column
missmap(
  food_composition,
  main = "Food Composition - Missing Values",
  col = c("#ffb55a", "#7eb0d5"),
  legend = TRUE
)
Many fibre values are missing; however, the rows that do have values are enough to keep working with the dataset, so let's remove the rows with missing values:
# Keep only the rows that have a value in every column
food_composition <- drop_na(food_composition)
# Redraw the missingness map to check the NA values in the dataset now
missmap(
  food_composition,
  main = "Food Composition - Missing Values",
  col = c("#ffb55a", "#7eb0d5"),
  legend = TRUE
)
# Sanity check: the foods with the highest fibre content come first
food_composition %>%
  arrange(desc(aoac_fibre_g)) %>%
  head()
| food_name | aoac_fibre_g | energy_kcal_kcal | total_sugars_g | group |
|---|---|---|---|---|
| <chr> | <dbl> | <int> | <dbl> | <chr> |
| Green beans, dried | 35.0 | 246 | 22.6 | DI |
| Breakfast cereal, bran type cereal, fortified | 24.6 | 267 | 20.0 | AI |
| Crispbread, rye | 20.0 | 284 | 3.4 | AM |
| Onions, dried, raw | 19.3 | 309 | 54.4 | DG |
| Lentils, red, split, dried, raw | 17.4 | 311 | 1.3 | DB |
| Breakfast cereal, bran flakes, fortified | 13.4 | 333 | 21.0 | AI |
# Named character vector mapping each food group code to its printed name
lookup_list <- c(
  # A: cereals and cereal products
  "AA" = "Flours, grains and starches",
  "AB" = "Sandwiches",
  "AC" = "Rice",
  "AD" = "Pasta",
  "AE" = "Pizzas",
  "AF" = "Breads",
  "AG" = "Rolls",
  "AI" = "Breakfast cereals",
  "AK" = "Infant cereal foods",
  "AM" = "Biscuits",
  "AN" = "Cakes",
  "AO" = "Pastry",
  "AP" = "Buns and pastries",
  "AS" = "Puddings",
  "AT" = "Savouries",
  # B: milk and milk products
  "BA" = "Cows milk",
  "BAB" = "Breakfast milk",
  "BAE" = "Skimmed milk",
  "BAH" = "Semi-skimmed milk",
  "BAK" = "Whole milk",
  "BAN" = "Channel Island milk",
  "BAR" = "Processed milks",
  "BC" = "Other milks",
  "BF" = "Infant formulas",
  "BFD" = "Whey-based modified milks",
  "BFG" = "Non-whey-based modified milks",
  "BFJ" = "Soya-based modified milks",
  "BFP" = "Follow-on formulas",
  "BH" = "Milk-based drinks",
  "BJ" = "Creams",
  "BJC" = "Fresh creams (pasteurised)",
  "BJF" = "Frozen creams (pasteurised)",
  "BJL" = "Sterilised creams",
  "BJP" = "UHT creams",
  "BJS" = "Imitation creams",
  "BL" = "Cheeses",
  "BN" = "Yogurts",
  "BNE" = "Whole milk yogurts",
  "BNH" = "Low fat yogurts",
  "BNS" = "Other yogurts",
  "BP" = "Ice creams",
  "BR" = "Puddings and chilled desserts",
  "BV" = "Savoury dishes and sauces",
  # C: eggs
  "CA" = "Eggs",
  "CD" = "Egg dishes",
  "CDE" = "Savoury egg dishes",
  "CDH" = "Sweet egg dishes",
  # D: vegetables
  "DA" = "Potatoes",
  "DAE" = "Early potatoes",
  "DAM" = "Main crop potatoes",
  "DAP" = "Chipped old potatoes",
  "DAR" = "Potato products",
  "DB" = "Beans and lentils",
  "DF" = "Peas",
  "DG" = "Vegetables, general",
  "DI" = "Vegetables, dried",
  "DR" = "Vegetable dishes",
  # F: fruit
  "FA" = "Fruit, general",
  "FC" = "Fruit juices",
  # G: nuts and seeds
  "GA" = "Nuts and seeds, general",
  # J: fish
  "JA" = "White fish",
  "JC" = "Fatty fish",
  "JK" = "Crustacea",
  "JM" = "Molluscs",
  "JR" = "Fish products and dishes",
  # M: meat
  "MA" = "Meat",
  "MAA" = "Bacon",
  "MAC" = "Beef",
  "MAE" = "Lamb",
  "MAG" = "Pork",
  "MAI" = "Veal",
  "MC" = "Poultry",
  "MCA" = "Chicken",
  "MCC" = "Duck",
  "MCE" = "Goose",
  "MCG" = "Grouse",
  "MCI" = "Partridge",
  "MCK" = "Pheasant",
  "MCM" = "Pigeon",
  "MCO" = "Turkey",
  "ME" = "Game",
  "MEA" = "Hare",
  "MEC" = "Rabbit",
  "MEE" = "Venison",
  "MG" = "Offal",
  "MBG" = "Burgers and grillsteaks",
  "MI" = "Meat products",
  "MIG" = "Other meat products",
  "MR" = "Meat dishes",
  # O: fats and oils
  "OA" = "Spreading fats",
  "OB" = "Animal fats",
  "OC" = "Oils",
  "OE" = "Non-animal fats",
  "OF" = "Cooking fats",
  # P: non-alcoholic beverages
  "PA" = "Powdered drinks, essences and infusions",
  "PAA" = "Powdered drinks and essences",
  "PAC" = "Infusions",
  "PC" = "Soft drinks",
  "PCA" = "Carbonated drinks",
  "PCC" = "Squash and cordials",
  "PE" = "Juices",
  # Q: alcoholic beverages
  "QA" = "Beers",
  "QC" = "Ciders",
  "QE" = "Wines",
  "QF" = "Fortified wines",
  "QG" = "Vermouths",
  "QI" = "Liqueurs",
  "QK" = "Spirits",
  # S: sugars, confectionery and snacks
  "SC" = "Sugars, syrups and preserves",
  "SE" = "Confectionery",
  "SEA" = "Chocolate confectionery",
  "SEC" = "Non-chocolate confectionery",
  "SN" = "Savoury snacks",
  "SNA" = "Potato-based snacks",
  "SNB" = "Potato and mixed cereal snacks",
  "SNC" = "Non-potato snacks",
  # W: soups, sauces and miscellaneous foods
  "WA" = "Soups",
  "WAA" = "Homemade soups",
  "WAC" = "Canned soups",
  "WAE" = "Packet soups",
  "WC" = "Sauces",
  "WCD" = "Dairy sauces",
  "WCG" = "Salad sauces, dressings and pickles",
  "WCN" = "Non-salad sauces",
  "WE" = "Pickles and chutneys",
  "WY" = "Miscellaneous foods"
)
# Quick checks: look up one code, confirm the storage type, preview the first entries
lookup_list["WA"]
typeof(lookup_list)
head(lookup_list)
- AA
- 'Flours, grains and starches'
- AB
- 'Sandwiches'
- AC
- 'Rice'
- AD
- 'Pasta'
- AE
- 'Pizzas'
- AF
- 'Breads'
# Keep only the foods whose group code has a printed name in lookup_list
food_composition <- food_composition[food_composition$group %in% names(lookup_list), ]
# Translate every group code to its printed name with a single vectorised lookup.
# Unlike sapply(), indexing the named vector always yields a character vector
# (also when no rows are left); unname() drops the group codes that would
# otherwise be carried along as element names.
food_composition$group_full_name <- unname(lookup_list[food_composition$group])
head(food_composition)
| food_name | aoac_fibre_g | energy_kcal_kcal | total_sugars_g | group | group_full_name |
|---|---|---|---|---|---|
| <chr> | <dbl> | <int> | <dbl> | <chr> | <chr> |
| Almonds, toasted | 10.9 | 579 | 5.1 | GA | Nuts and seeds, general |
| Almonds, weighed with shells | 4.6 | 205 | 1.6 | GA | Nuts and seeds, general |
| Almonds, whole kernels | 12.5 | 554 | 4.5 | GA | Nuts and seeds, general |
| Anchovies, canned in oil, drained | 0.0 | 191 | 0.0 | JC | Fatty fish |
| Apple sauce, homemade | 1.4 | 79 | 20.2 | WC | Sauces |
| Apples, cooking, baked with sugar, flesh only | 1.9 | 69 | 17.1 | FA | Fruit, general |
We have too many values to plot on a graph, so a grouping could make sense. We'll use the groups already provided with the dataset.
# Average fibre and sugar content per food group, rounded to 3 decimals.
# round() already returns a number, so no format()/as.numeric() round trip is
# needed (the nsmall argument was being passed to as.numeric(), which ignores it).
food_composition_by_g <- food_composition %>%
  group_by(group_full_name) %>%
  summarise(
    fibre_group_average = round(mean(aoac_fibre_g), 3),
    sugar_group_average = round(mean(total_sugars_g), 3)
  )
head(food_composition_by_g)
| group_full_name | fibre_group_average | sugar_group_average |
|---|---|---|
| <chr> | <dbl> | <dbl> |
| Bacon | 0.005 | 0.118 |
| Beans and lentils | 6.736 | 1.659 |
| Beef | 0.000 | 0.000 |
| Biscuits | 4.017 | 24.795 |
| Breads | 3.868 | 3.844 |
| Breakfast cereals | 6.284 | 17.861 |
# Rank the food groups from highest to lowest average fibre and show the top 30
food_composition_by_g <- food_composition_by_g %>%
  arrange(desc(fibre_group_average))
head(food_composition_by_g, 30)
| group_full_name | fibre_group_average | sugar_group_average |
|---|---|---|
| <chr> | <dbl> | <dbl> |
| Vegetables, dried | 22.000 | 31.000 |
| Non-potato snacks | 8.400 | 1.900 |
| Savoury snacks | 7.114 | 3.829 |
| Beans and lentils | 6.736 | 1.659 |
| Breakfast cereals | 6.284 | 17.861 |
| Nuts and seeds, general | 5.760 | 10.413 |
| Peas | 4.370 | 3.290 |
| Rolls | 4.040 | 3.280 |
| Biscuits | 4.017 | 24.795 |
| Breads | 3.868 | 3.844 |
| Potato products | 3.800 | 1.233 |
| Potato-based snacks | 3.650 | 1.200 |
| Chipped old potatoes | 3.367 | 0.922 |
| Pastry | 3.247 | 1.073 |
| Packet soups | 3.050 | 9.550 |
| Pasta | 3.018 | 2.218 |
| Vegetables, general | 2.720 | 4.901 |
| Potatoes | 2.700 | 0.800 |
| Pizzas | 2.560 | 2.560 |
| Buns and pastries | 2.546 | 13.219 |
| Infant cereal foods | 2.400 | 23.400 |
| Vegetable dishes | 2.351 | 3.178 |
| Main crop potatoes | 2.283 | 1.250 |
| Chocolate confectionery | 2.230 | 55.170 |
| Fruit, general | 2.190 | 14.770 |
| Early potatoes | 2.100 | 1.125 |
| Rice | 2.053 | 0.874 |
| Puddings | 2.003 | 21.903 |
| Powdered drinks and essences | 1.957 | 26.907 |
| Cakes | 1.852 | 34.141 |
# Box plot of the per-group average fibre content (one box for all food groups).
# No per-row `text` aesthetic is mapped here: ggplot2 groups the data by every
# discrete variable in the mapping, so a unique hover string per row would split
# the data into one single-observation "box" per food group instead of one
# distribution box.
box_plot <- ggplot(
  data = food_composition_by_g,
  aes(y = fibre_group_average)
) +
  geom_boxplot(fill = "lightblue", color = "black", outlier.colour = "red") +
  labs(
    title = "Box Plot of AOAC Fiber (g)",
    y = "AOAC Fiber (g)"
  ) +
  theme_minimal()
# Convert ggplot to plotly for interactivity (default hover information)
fig <- ggplotly(box_plot, height = 600, width = 800)
fig
# Scatter plot of average sugar against average fibre, one point per food group
scatter_plot <- food_composition_by_g %>%
  ggplot(
    aes(
      x = fibre_group_average,
      y = sugar_group_average,
      # Hover text picked up by ggplotly() through tooltip = "text"
      text = paste(
        "Name:", group_full_name,
        "<br>Fiber:", fibre_group_average,
        "<br>Sugar:", sugar_group_average
      )
    )
  ) +
  geom_point(
    shape = 22,
    size = 2,
    stroke = .2,
    alpha = 0.5,
    color = "black",
    fill = "#69b3a2"
  ) +
  theme_ipsum()
# Interactive version of the scatter plot
fig_correl <- ggplotly(scatter_plot, height = 600, width = 800, tooltip = "text")
fig_correl
# Export the cleaned per-food table and the per-group averages as CSV files
# (without the row-name column)
food_composition %>%
  write.csv(
    file = "/kaggle/working/food_composition_cleaned.csv",
    row.names = FALSE
  )
food_composition_by_g %>%
  write.csv(
    file = "/kaggle/working/food_composition_grouped.csv",
    row.names = FALSE
  )